In [15]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import sys
import json
import plotly.figure_factory as ff
import plotly.express as px
sys.tracebacklimit = 0 # turn off the error tracebacks

Problem 1 - Data Extraction¶

  • A: Get a BeautifulSoup object of half-marathons within 200 miles of Virginia Beach

  • B: Build a function to scrape date, race_title, city, and race distances from the website

  • C: Build a for loop to turn the scraping function into a spider for races further into the future

  • D: Save the data from each loop as a DataFrame and concatenate the results into one df

In [109]:
# Part 1A: Get a BeautifulSoup object of the Running in the USA listing, page 1.
# First discover the user-agent string this client presents (via httpbin), then
# send it — plus a contact email — with every request so the site can identify us.
ua_response = requests.get('https://httpbin.org/user-agent')
useragent = json.loads(ua_response.text)['user-agent']
headers = {'User-Agent': useragent,
           'from': 'vrd9sd@virginia.edu'}

url = 'https://runningintheusa.com/classic/list/within-200-miles-of-virginia%20beach-va/upcoming/half-marathon/miles-between-250/page-1'
r = requests.get(url, headers=headers)

# Parse the raw HTML into a navigable soup object
mysoup = BeautifulSoup(r.text, 'html.parser')
In [110]:
# part 1B: function to scrape races from the Running in the USA website
def race_df(url):
    """Scrape one listing page and return a DataFrame of race info.

    Args:
        url: (str) URL of one Running in the USA listing page

    Returns:
        races: (pd.DataFrame) columns ['date', 'race', 'city', 'distance'],
            one row per race on the page. If the scraped field lists do not
            line up, an EMPTY DataFrame with the same columns is returned
            (the original returned -1, an int, which broke downstream
            pd.concat calls).

    NOTE: relies on the module-level `headers` dict defined in Part 1A, and
    on the site's current HTML layout — the index arithmetic below is tied
    to that layout and will need revisiting if the markup changes.
    """
    r = requests.get(url, headers=headers)
    mysoup = BeautifulSoup(r.text, 'html.parser')

    # Race titles sit in <b> tags inside styled <td> cells; every other match
    # appears to be a duplicate link, so keep only the odd-indexed entries.
    titles = [x.b for x in mysoup.find_all('td', attrs = {'style': 'text-decoration:inherit;'})]
    titles = [x.string for i, x in enumerate(titles) if i%2==1]

    # Cities: skip the first two rowspan cells (header/featured rows), then
    # every 4th cell holds the city string.
    rowspan_1_2_cities = [x.b for x in mysoup.find_all('td', attrs={'rowspan':['1', '2']})]
    cities = rowspan_1_2_cities[2:]
    cities = [x.string for i,x in enumerate(cities) if i%4==0]

    # Dates: drop the first two bold divs (page chrome), keep one per city.
    dates = [x.string for x in mysoup.find_all('div', attrs = {'style':"font-weight:bold"})]
    dates = [x for i,x in enumerate(dates) if i>1 and i<= (len(cities) + 1)]

    # Distances: the padded divs ending in 'run'/'relay' are the event lists.
    distances = [x.string for x in mysoup.find_all('div', attrs={'style':"padding-left:10px"}) if x.string and (x.string.endswith('run') or x.string.endswith('relay'))]

    if len(dates) == len(titles) == len(cities) == len(distances): # lengths must match so records line up one-to-one
        # Part 1D: build the races df; slicing [2:] drops the featured/sponsored
        # listings pinned to the top of every page.
        races = pd.DataFrame({
            'date': dates[2:],
            'race': titles[2:],
            'city': cities[2:],
            'distance': distances[2:]})
    else:
        # Misaligned parse — return an empty frame with the same schema so
        # callers (e.g. pd.concat in the spider loop) still work.
        races = pd.DataFrame(columns=['date', 'race', 'city', 'distance'])
    return races
# Part 1C: Turn this web-scraper into a spider!
# Collect each page's frame in a list and concatenate ONCE at the end —
# repeated pd.concat inside a loop copies the accumulated frame every
# iteration (quadratic cost) and is a known pandas anti-pattern.
page_frames = [race_df(url)]
for page in range(2, 7):
    url = url[:-1] + str(page)  # swap the trailing page number in the url for pages 2-6
    page_frames.append(race_df(url))
new_df = pd.concat(page_frames, ignore_index=True)
In [101]:
new_df 
Out[101]:
date race city distance
0 Dec 14, 2024 Pocahontas Half Chesterfield, VA 13.1M, 10K, 5K run
1 Dec 15, 2024 Holiday Half Marathon Annandale, VA 13.1M, 4M run
2 Dec 21, 2024 Naptown Half Marathon Annapolis, MD 13.1M, 10K, 5K run
3 Dec 21, 2024 Oakwood 24 Raleigh, NC 24H, 12H, 50M, 50K, 26.2M, 13.1M, 5K run
4 Dec 21, 2024 Rudolph's Race 5K & Half Washington, DC 13.1M, 5K run
... ... ... ... ...
115 Apr 26, 2025 Spring Third Winchester Battlefield 5K, 10K, &... Winchester, VA 13.1M, 10K, 5K trail run
116 Apr 26, 2025 Wake Forest Historic Half Wake Forest, NC 13.1M run
117 Apr 27, 2025 Blackbeard's Half Marathon Ocracoke, NC 13.1M run
118 Apr 27, 2025 Runners Half Marathon of Reston Reston, VA 13.1M, 5K run
119 May 3, 2025 Neuse River Bridge Run New Bern, NC 13.1M, 10K, 5K run

120 rows × 4 columns

Problem 2 - Data cleaning¶

  • A: Convert the race dates to datetime objects

  • B: Add a day of the week column for races

  • C: Order the Columns in Race DF to be aesthetically pleasing

  • D: Save the race data to a csv

In [111]:
# Part 2A: parse the raw date strings into datetime64 values
new_df['date'] = pd.to_datetime(new_df['date'])
# Part 2B: derive the weekday name for each race from its date
new_df['day'] = new_df['date'].dt.day_name()
# Part 2C: reorder the columns so day/date lead the table
column_order = ['day', 'date', 'race', 'city', 'distance']
new_df = new_df[column_order]
In [73]:
import os

csv_file_path = 'half_marathon_data.csv'

# Part 2D: persist the cleaned race table — but only on the first run, so an
# existing file is never clobbered by a re-execution of this cell.
already_saved = os.path.exists(csv_file_path)
if not already_saved:
    new_df.to_csv(csv_file_path, index=False)

Problem 3 - EDA¶

  • A: Show how many races featuring a half marathon each city within 200 miles of Virginia Beach has on the Running in the USA site

  • B: Make a table of all the races

  • C: Show how many races occur on each date

  • D: Identify the races that don't occur on Saturdays or Sundays

In [74]:
# Problem 3A: count races per city.
# BUG FIX: the original grouped by 'cities' and aggregated 'races', but the
# DataFrame's columns are 'city' and 'race' — on a fresh kernel run that
# raises a KeyError. Use named aggregation so the output column is already
# called 'races' (matching the table shown in Out[75]).
city_race_count = (
    new_df.groupby('city')
          .agg(races=('race', 'count'))
          .sort_values(by='races', ascending=False)
          .reset_index()
)
In [75]:
city_race_count.head()
Out[75]:
city races
0 Washington, DC 10
1 Colonial Heights, VA 7
2 Williamsburg, VA 5
3 Raleigh, NC 5
4 Cary, NC 3
In [112]:
# Problem 3B
# Render the full race table as a plotly figure-factory table object.
table=ff.create_table(new_df)
# table  # uncomment to display the table inline
In [115]:
# Problem 3C: how many races are held on each calendar date
date_race_count = (
    new_df.groupby('date')
          .agg({'race': 'count'})
          .reset_index()
)
date_race_count.head()
Out[115]:
date race
0 2024-10-05 6
1 2024-10-06 4
2 2024-10-12 5
3 2024-10-13 1
4 2024-10-19 7
In [116]:
# Part 3D: races that do NOT fall on a weekend day
weekend_days = ['Saturday', 'Sunday']
new_df.query("day not in @weekend_days")
Out[116]:
day date race city distance
52 Monday 2024-11-11 Service and Sacrifice 5K & Half Washington, DC 13.1M, 5K run
63 Thursday 2024-11-28 Pie Gobbler 1M, 5K, 10K, 15K, and Half Marathon Williamsburg, VA 13.1M, 15K, 10K, 5K, 1M run
64 Thursday 2024-11-28 Skinny Turkey Half Marathon Raleigh, NC 13.1M, 10K, 5K run | kids run
87 Wednesday 2025-01-01 New Year Day 1M, 5K, 10K, 15K, and Half Marathon Williamsburg, VA 13.1M, 15K, 10K, 5K, 1M run

Problem 4 - Data Visualization¶

  • A: Create a table of months of year, counts of races and day

  • B: Create an interactive barplot that shows the counts of half-marathons by day as well as month they occur in

  • C: Create line chart of races containing a half marathon by month

In [120]:
# Part 4A: table of race counts by day of week and month.
# BUG FIX: the original referenced `date_df`, which is never defined anywhere
# in this notebook (hidden kernel state from a deleted cell) — a fresh
# Restart & Run All raised a NameError. Derive it from new_df instead.
date_df = new_df.assign(month=new_df['date'].dt.month)
date_count = date_df.groupby(['day', 'month'])['race'].count().reset_index()
date_count.head()
Out[120]:
day month race
0 Monday 11 1
1 Saturday 1 4
2 Saturday 2 3
3 Saturday 3 4
4 Saturday 4 8
In [163]:
# Part 4B: interactive bar chart of race counts by day of week
import plotly.offline as pyo
pyo.init_notebook_mode()

# tally how many races land on each weekday
day_count = new_df['day'].value_counts().reset_index()
day_count.columns = ['day', 'count']

bar_title = ('number of races w/ half marathons within 200 miles of '
             'Virginia Beach categorized by day')
fig = px.bar(day_count,
             x='day',
             y='count',
             text='count',
             color='day',
             title=bar_title,
             labels={'count': 'number of races'})

fig.show()
In [164]:
# Part 4C: line chart of monthly race counts.
# Bucket each race into its year-month period, count per bucket, then convert
# the periods back to timestamps so plotly can place them on a time axis.
new_df['year_month'] = new_df['date'].dt.to_period('M')

monthly_race_count = (
    new_df.groupby('year_month')
          .size()
          .reset_index(name='count')
)
monthly_race_count['year_month'] = monthly_race_count['year_month'].dt.to_timestamp()

fig = px.line(monthly_race_count,
              x='year_month',
              y='count',
              title='Number of Races with a half-marathon within 200 miles of VB by Month',
              labels={'year_month': 'Month', 'count': 'Count of Races'})

fig.show()
In [177]:
# Geocoding experiment (left disabled): would look up coordinates for each
# city via OpenStreetMap's Nominatim service. Commented out because it needs
# the third-party geopy package and live network access; the 'Washington'
# value shown in Out[177] is stale kernel state, not output of this code.
# Import the required library
# from geopy.geocoders import Nominatim
# geolocator = Nominatim(user_agent="myGeocoder for student project")
# lat = geolocator.geocode("Cary, NC")
Out[177]:
'Washington'
In [168]:
# Disabled sketch of a map visualization: px.scatter_geo needs real
# latitude/longitude values (or ISO location codes) per row, which this
# DataFrame does not have yet — depends on the geocoding cell above working.
# Assuming you have a latitude and longitude for each city
# fig = px.scatter_geo(new_df, locations="city", hover_name="race", title="Races by Location")
# fig.show()
In [ ]: